## Accessing the Files
#You can access the code and data files for this project on GitHub:
#- [GitHub Repository](https://github.com/AtodariaTMU/Toronto-Housing-Market)
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Load the MLS housing dataset from disk into the working DataFrame `df`.
file_path = "mls.csv"
df = pd.read_csv(filepath_or_buffer=file_path)
def check_df(dataframe, head=5):
    """Print a quick structural overview of a DataFrame.

    The sections are printed in this order: shape, ``DataFrame.info()``,
    missing values per column, number of fully duplicated rows, and the
    transposed ``describe()`` statistics.

    Args:
        dataframe: The pandas DataFrame to summarise.
        head: Accepted for backward compatibility; no section currently
            uses it.

    Returns:
        None. Everything is written to standard output.
    """

    def banner(title):
        # Section title centred in a 70-character row of '#'.
        print(" {} ".format(title).center(70, '#'))

    row_count, column_count = dataframe.shape

    banner("SHAPE")
    print('Rows: {}'.format(row_count))
    print('Columns: {}'.format(column_count))

    banner("INFO")
    # info() writes its report itself and returns None, so wrapping it in
    # print() would add a stray "None" line after the report.
    dataframe.info()

    banner("MISSING VALUES")
    print(dataframe.isnull().sum())

    banner("DUPLICATED VALUES")
    print(dataframe.duplicated().sum())

    banner("DESCRIBE")
    print(dataframe.describe().T)
# Print the structural overview (shape, info, missing values, duplicates,
# describe) of the raw MLS data held in `df`.
check_df(df)
############################### SHAPE ################################
Rows: 5091
Columns: 17
################################ INFO ################################
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5091 entries, 0 to 5090
Data columns (total 17 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 Location 5091 non-null object
1 CompIndex 5076 non-null float64
2 CompBenchmark 5076 non-null float64
3 CompYoYChange 5076 non-null float64
4 SFDetachIndex 5076 non-null float64
5 SFDetachBenchmark 5076 non-null float64
6 SFDetachYoYChange 5075 non-null float64
7 SFAttachIndex 4949 non-null float64
8 SFAttachBenchmark 4949 non-null float64
9 SFAttachYoYChange 4951 non-null float64
10 THouseIndex 3803 non-null float64
11 THouseBenchmark 3803 non-null float64
12 THouseYoYChange 3804 non-null float64
13 ApartIndex 4010 non-null float64
14 ApartBenchmark 4010 non-null float64
15 ApartYoYChange 4008 non-null float64
16 Date 5091 non-null object
dtypes: float64(15), object(2)
memory usage: 676.3+ KB
None
########################### MISSING VALUES ###########################
Location 0
CompIndex 15
CompBenchmark 15
CompYoYChange 15
SFDetachIndex 15
SFDetachBenchmark 15
SFDetachYoYChange 16
SFAttachIndex 142
SFAttachBenchmark 142
SFAttachYoYChange 140
THouseIndex 1288
THouseBenchmark 1288
THouseYoYChange 1287
ApartIndex 1081
ApartBenchmark 1081
ApartYoYChange 1083
Date 0
dtype: int64
######################### DUPLICATED VALUES ##########################
62
############################## DESCRIBE ##############################
count mean std min \
CompIndex 5076.0 244.871592 41.374120 135.00
CompBenchmark 5076.0 772049.881797 272588.169295 257000.00
CompYoYChange 5076.0 10.222695 9.186082 -19.58
SFDetachIndex 5076.0 246.021277 41.829704 126.00
SFDetachBenchmark 5076.0 948100.059102 376586.547562 259100.00
SFDetachYoYChange 5075.0 9.329878 10.411804 -21.32
SFAttachIndex 4949.0 247.290139 43.124667 137.20
SFAttachBenchmark 4949.0 718284.198828 243377.690928 273300.00
SFAttachYoYChange 4951.0 9.311238 10.105563 -100.00
THouseIndex 3803.0 245.168393 55.364090 0.00
THouseBenchmark 3803.0 582943.965291 221250.438808 0.00
THouseYoYChange 3804.0 9.466312 14.473397 -100.00
ApartIndex 4010.0 236.428279 52.340839 106.00
ApartBenchmark 4010.0 483821.970075 145759.334129 171400.00
ApartYoYChange 4008.0 11.521093 9.079515 -10.60
25% 50% 75% max
CompIndex 217.600 247.10 2.705000e+02 409.90
CompBenchmark 589000.000 726500.00 9.033000e+05 2162900.00
CompYoYChange 4.380 9.39 1.538250e+01 46.78
SFDetachIndex 220.300 246.70 2.720000e+02 415.60
SFDetachBenchmark 682200.000 873450.00 1.137150e+06 2536900.00
SFDetachYoYChange 2.245 8.54 1.571500e+01 49.10
SFAttachIndex 220.700 246.70 2.699000e+02 709.00
SFAttachBenchmark 542900.000 671600.00 8.399000e+05 1677200.00
SFAttachYoYChange 2.715 8.76 1.537000e+01 43.71
THouseIndex 210.500 246.40 2.783000e+02 466.80
THouseBenchmark 439450.000 555000.00 6.731000e+05 1750500.00
THouseYoYChange 4.080 9.63 1.560250e+01 56.49
ApartIndex 196.625 239.60 2.722000e+02 395.30
ApartBenchmark 381925.000 477600.00 5.692500e+05 1005500.00
ApartYoYChange 5.640 9.74 1.529000e+01 64.32
# 1. Handle missing values
# Drop every row that has a missing value (filling with the mean, median,
# or mode would be the alternative).
df.dropna(inplace=True)
# 2. Convert data types if necessary
# In this case, we will convert the 'Date' column to a datetime object
df['Date'] = pd.to_datetime(df['Date'])
# 3. Handle outliers (example: IQR method for 'CompIndex')
Q1 = df['CompIndex'].quantile(0.25)
Q3 = df['CompIndex'].quantile(0.75)
IQR = Q3 - Q1
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR
# Keep only the rows whose CompIndex lies inside the IQR fences (inclusive).
# .copy() makes the result an independent frame, so the column assignment
# in step 4 writes to `df` itself instead of to a slice of the old frame
# (which would trigger pandas' SettingWithCopyWarning).
df = df[df['CompIndex'].between(lower_bound, upper_bound)].copy()
print(df)
# 4. Normalize or scale features (if necessary)
# We normalize the 'CompIndex' column using Min-Max normalization.
# Note that this overwrites the original index values with values in [0, 1].
from sklearn.preprocessing import MinMaxScaler
# Select the feature(s) to normalize
feature_to_normalize = ['CompIndex']
# Instantiate the scaler object
scaler = MinMaxScaler()
# Fit and transform the data
df[feature_to_normalize] = scaler.fit_transform(df[feature_to_normalize])
Location CompIndex CompBenchmark CompYoYChange SFDetachIndex \
69 Ajax 179.8 444300.0 16.00 178.8
70 Ajax 180.4 445800.0 16.31 179.2
71 Ajax 182.8 451700.0 15.55 181.4
72 Ajax 183.8 454200.0 15.74 182.3
73 Ajax 182.1 450000.0 12.69 180.3
... ... ... ... ... ...
5086 York Region 287.0 970300.0 11.76 292.0
5087 York Region 289.5 978700.0 12.56 295.2
5088 York Region 294.9 997000.0 13.34 301.7
5089 York Region 309.9 1047700.0 17.16 318.6
5090 York Region 324.1 1095700.0 19.82 334.4
SFDetachBenchmark SFDetachYoYChange SFAttachIndex SFAttachBenchmark \
69 478900.0 17.86 187.1 402400.0
70 480000.0 15.76 187.6 403500.0
71 485800.0 15.10 190.1 408800.0
72 488300.0 14.51 191.2 411200.0
73 482900.0 11.99 190.3 409300.0
... ... ... ... ...
5086 1112000.0 12.65 293.2 847300.0
5087 1124200.0 13.89 296.3 856200.0
5088 1148900.0 15.20 302.9 875300.0
5089 1213300.0 20.23 322.6 932200.0
5090 1273400.0 23.49 337.2 974400.0
SFAttachYoYChange THouseIndex THouseBenchmark THouseYoYChange \
69 15.64 170.5 311600.0 18.16
70 16.81 170.4 311400.0 15.76
71 15.99 172.3 314900.0 14.33
72 15.95 177.1 323700.0 17.13
73 13.14 173.1 316300.0 12.99
... ... ... ... ...
5086 12.73 271.3 704100.0 11.28
5087 13.66 270.4 701800.0 10.28
5088 14.78 272.6 707500.0 10.14
5089 20.15 280.7 728500.0 11.04
5090 22.48 294.0 763000.0 14.75
ApartIndex ApartBenchmark ApartYoYChange Date
69 148.7 252500.0 5.61 2015-07-01
70 151.5 257200.0 8.99 2015-08-01
71 154.3 262000.0 10.53 2015-09-01
72 152.4 258800.0 8.70 2015-10-01
73 151.2 256700.0 5.00 2015-11-01
... ... ... ... ...
5086 252.2 562700.0 6.91 2020-11-01
5087 253.2 564900.0 6.84 2020-12-01
5088 254.7 568200.0 5.86 2021-01-01
5089 260.2 580500.0 5.39 2021-02-01
5090 268.7 599500.0 6.00 2021-03-01
[3648 rows x 17 columns]
# Import our housing data into Pandas. Reuse `file_path` from the first load
# so the file name (including its letter case) is the same everywhere; a
# differently-cased name only resolves on case-insensitive filesystems.
HousingData = pd.read_csv(file_path)
print(HousingData.shape)
# In a notebook only the last expression of a cell is displayed, so this
# head() call shows nothing; it is kept unchanged.
HousingData.head()
print(df)
# Missing values per column of the raw (uncleaned) data.
HousingData.isna().sum()
(5091, 17)
Location CompIndex CompBenchmark CompYoYChange SFDetachIndex \
69 Ajax 0.175214 444300.0 16.00 178.8
70 Ajax 0.178063 445800.0 16.31 179.2
71 Ajax 0.189459 451700.0 15.55 181.4
72 Ajax 0.194207 454200.0 15.74 182.3
73 Ajax 0.186135 450000.0 12.69 180.3
... ... ... ... ... ...
5086 York Region 0.684236 970300.0 11.76 292.0
5087 York Region 0.696106 978700.0 12.56 295.2
5088 York Region 0.721747 997000.0 13.34 301.7
5089 York Region 0.792972 1047700.0 17.16 318.6
5090 York Region 0.860399 1095700.0 19.82 334.4
SFDetachBenchmark SFDetachYoYChange SFAttachIndex SFAttachBenchmark \
69 478900.0 17.86 187.1 402400.0
70 480000.0 15.76 187.6 403500.0
71 485800.0 15.10 190.1 408800.0
72 488300.0 14.51 191.2 411200.0
73 482900.0 11.99 190.3 409300.0
... ... ... ... ...
5086 1112000.0 12.65 293.2 847300.0
5087 1124200.0 13.89 296.3 856200.0
5088 1148900.0 15.20 302.9 875300.0
5089 1213300.0 20.23 322.6 932200.0
5090 1273400.0 23.49 337.2 974400.0
SFAttachYoYChange THouseIndex THouseBenchmark THouseYoYChange \
69 15.64 170.5 311600.0 18.16
70 16.81 170.4 311400.0 15.76
71 15.99 172.3 314900.0 14.33
72 15.95 177.1 323700.0 17.13
73 13.14 173.1 316300.0 12.99
... ... ... ... ...
5086 12.73 271.3 704100.0 11.28
5087 13.66 270.4 701800.0 10.28
5088 14.78 272.6 707500.0 10.14
5089 20.15 280.7 728500.0 11.04
5090 22.48 294.0 763000.0 14.75
ApartIndex ApartBenchmark ApartYoYChange Date
69 148.7 252500.0 5.61 2015-07-01
70 151.5 257200.0 8.99 2015-08-01
71 154.3 262000.0 10.53 2015-09-01
72 152.4 258800.0 8.70 2015-10-01
73 151.2 256700.0 5.00 2015-11-01
... ... ... ... ...
5086 252.2 562700.0 6.91 2020-11-01
5087 253.2 564900.0 6.84 2020-12-01
5088 254.7 568200.0 5.86 2021-01-01
5089 260.2 580500.0 5.39 2021-02-01
5090 268.7 599500.0 6.00 2021-03-01
[3648 rows x 17 columns]
Location 0 CompIndex 15 CompBenchmark 15 CompYoYChange 15 SFDetachIndex 15 SFDetachBenchmark 15 SFDetachYoYChange 16 SFAttachIndex 142 SFAttachBenchmark 142 SFAttachYoYChange 140 THouseIndex 1288 THouseBenchmark 1288 THouseYoYChange 1287 ApartIndex 1081 ApartBenchmark 1081 ApartYoYChange 1083 Date 0 dtype: int64
# List the distinct Location values of the raw data.
# NOTE(review): the recorded output contains 'EGswsiallimbury' and
# 'GEswsiallimbury' as well as both 'Bradford West' and
# 'Bradford West Gwillimbury'; these look like corrupted or duplicate
# spellings and may need normalising before grouping by Location.
HousingData["Location"].unique()
array(['Adjala-Tosorontio', 'Ajax', 'Aurora', 'Barrie', 'Bradford West',
'Bradford West Gwillimbury', 'Brampton', 'Brock', 'Burlington',
'Caledon', 'City of Toronto', 'Clarington', 'Dufferin County',
'Durham Region', 'EGswsiallimbury', 'East Gwillimbury', 'Essa',
'GEswsiallimbury', 'Georgina', 'Halton Hills', 'Halton Region',
'Innisfil', 'King', 'Markham', 'Milton', 'Mississauga',
'New Tecumseth', 'Newmarket', 'Oakville', 'Orangeville', 'Oshawa',
'Peel Region', 'Pickering', 'Richmond Hill', 'Scugog',
'Simcoe County', 'TREB Total', 'Toronto C01', 'Toronto C02',
'Toronto C03', 'Toronto C04', 'Toronto C06', 'Toronto C07',
'Toronto C08', 'Toronto C09', 'Toronto C10', 'Toronto C11',
'Toronto C12', 'Toronto C13', 'Toronto C14', 'Toronto C15',
'Toronto E01', 'Toronto E02', 'Toronto E03', 'Toronto E04',
'Toronto E05', 'Toronto E06', 'Toronto E07', 'Toronto E08',
'Toronto E09', 'Toronto E10', 'Toronto E11', 'Toronto W01',
'Toronto W02', 'Toronto W03', 'Toronto W04', 'Toronto W05',
'Toronto W06', 'Toronto W07', 'Toronto W08', 'Toronto W09',
'Toronto W10', 'Uxbridge', 'Vaughan', 'Whitby',
'Whitchurch-Stouffville', 'York Region'], dtype=object)
# Summary statistics of the cleaned frame `df` (not of HousingData).
summary_stats = df.describe()
print(summary_stats)
# Parse the raw frame's 'Date' strings as year-month timestamps.
HousingData['Date'] = pd.to_datetime(HousingData['Date'], format='%Y-%m')
CompIndex CompBenchmark CompYoYChange SFDetachIndex \
count 3648.000000 3.648000e+03 3648.000000 3648.000000
mean 0.500226 7.977442e+05 9.807256 249.776563
std 0.185440 2.619130e+05 8.428596 39.578971
min 0.000000 3.099000e+05 -19.580000 126.300000
25% 0.374644 6.221000e+05 4.637500 228.000000
50% 0.514245 7.523500e+05 9.160000 250.800000
75% 0.622507 9.167000e+05 14.252500 275.625000
max 1.000000 2.162900e+06 38.990000 380.500000
SFDetachBenchmark SFDetachYoYChange SFAttachIndex SFAttachBenchmark \
count 3.648000e+03 3648.000000 3648.000000 3.648000e+03
mean 1.024441e+06 8.573712 248.344079 7.660010e+05
std 3.636600e+05 9.994362 39.055297 2.322276e+05
min 3.455000e+05 -21.320000 137.200000 2.868000e+05
25% 7.684500e+05 2.097500 222.900000 6.047250e+05
50% 9.478000e+05 8.020000 248.750000 7.210000e+05
75% 1.206825e+06 14.100000 271.800000 8.791000e+05
max 2.536900e+06 45.690000 385.400000 1.677200e+06
SFAttachYoYChange THouseIndex THouseBenchmark THouseYoYChange \
count 3648.000000 3648.000000 3.648000e+03 3648.000000
mean 8.562283 245.427001 5.863073e+05 9.374052
std 9.451189 53.697026 2.220564e+05 14.395304
min -21.470000 0.000000 0.000000e+00 -100.000000
25% 2.457500 212.550000 4.455000e+05 4.000000
50% 8.145000 247.300000 5.587000e+05 9.630000
75% 14.212500 278.900000 6.735000e+05 15.470000
max 38.700000 407.600000 1.750500e+06 56.490000
ApartIndex ApartBenchmark ApartYoYChange
count 3648.000000 3648.000000 3648.000000
mean 235.345011 473784.539474 11.772366
std 51.219040 137700.373162 9.179031
min 106.000000 171400.000000 -7.450000
25% 195.075000 376950.000000 5.807500
50% 239.500000 469350.000000 9.950000
75% 271.800000 555025.000000 15.685000
max 395.300000 975100.000000 64.320000
# Compute the correlation matrix over the numeric columns only.
# `df` also holds the string column 'Location' and the datetime column
# 'Date'; since pandas 2.0, DataFrame.corr() no longer drops non-numeric
# columns silently and raises on the strings, so select them explicitly.
correlation_matrix = df.select_dtypes(include='number').corr()
# Display the correlation matrix
print(correlation_matrix)
CompIndex CompBenchmark CompYoYChange SFDetachIndex \
CompIndex 1.000000 0.324620 -0.016030 0.854821
CompBenchmark 0.324620 1.000000 -0.107419 0.156587
CompYoYChange -0.016030 -0.107419 1.000000 0.091615
SFDetachIndex 0.854821 0.156587 0.091615 1.000000
SFDetachBenchmark 0.232403 0.864551 -0.050001 0.183471
SFDetachYoYChange -0.033444 -0.104609 0.903738 0.112949
SFAttachIndex 0.869300 0.317627 0.092676 0.796458
SFAttachBenchmark 0.327853 0.744917 -0.030826 0.218165
SFAttachYoYChange -0.004149 -0.099601 0.885237 0.098457
THouseIndex 0.692637 0.236967 -0.037960 0.573138
THouseBenchmark 0.243727 0.626291 -0.073826 0.139932
THouseYoYChange -0.002934 -0.109147 0.574179 0.058187
ApartIndex 0.843490 0.337859 -0.212718 0.569088
ApartBenchmark 0.550833 0.798945 -0.234686 0.365542
ApartYoYChange 0.051194 -0.084285 0.602743 0.116215
SFDetachBenchmark SFDetachYoYChange SFAttachIndex \
CompIndex 0.232403 -0.033444 0.869300
CompBenchmark 0.864551 -0.104609 0.317627
CompYoYChange -0.050001 0.903738 0.092676
SFDetachIndex 0.183471 0.112949 0.796458
SFDetachBenchmark 1.000000 -0.062769 0.213111
SFDetachYoYChange -0.062769 1.000000 0.128044
SFAttachIndex 0.213111 0.128044 1.000000
SFAttachBenchmark 0.896177 -0.043014 0.322640
SFAttachYoYChange -0.098351 0.961799 0.172743
THouseIndex 0.224219 -0.068398 0.600273
THouseBenchmark 0.689164 -0.096562 0.157458
THouseYoYChange -0.050869 0.494597 0.065241
ApartIndex 0.268881 -0.272095 0.658038
ApartBenchmark 0.724441 -0.251923 0.469663
ApartYoYChange -0.041487 0.327000 0.063699
SFAttachBenchmark SFAttachYoYChange THouseIndex \
CompIndex 0.327853 -0.004149 0.692637
CompBenchmark 0.744917 -0.099601 0.236967
CompYoYChange -0.030826 0.885237 -0.037960
SFDetachIndex 0.218165 0.098457 0.573138
SFDetachBenchmark 0.896177 -0.098351 0.224219
SFDetachYoYChange -0.043014 0.961799 -0.068398
SFAttachIndex 0.322640 0.172743 0.600273
SFAttachBenchmark 1.000000 -0.067719 0.294303
SFAttachYoYChange -0.067719 1.000000 -0.038499
THouseIndex 0.294303 -0.038499 1.000000
THouseBenchmark 0.708432 -0.115573 0.544215
THouseYoYChange -0.063878 0.500150 0.367768
ApartIndex 0.338733 -0.222036 0.644745
ApartBenchmark 0.746322 -0.238097 0.457065
ApartYoYChange -0.044532 0.318344 0.028606
THouseBenchmark THouseYoYChange ApartIndex \
CompIndex 0.243727 -0.002934 0.843490
CompBenchmark 0.626291 -0.109147 0.337859
CompYoYChange -0.073826 0.574179 -0.212718
SFDetachIndex 0.139932 0.058187 0.569088
SFDetachBenchmark 0.689164 -0.050869 0.268881
SFDetachYoYChange -0.096562 0.494597 -0.272095
SFAttachIndex 0.157458 0.065241 0.658038
SFAttachBenchmark 0.708432 -0.063878 0.338733
SFAttachYoYChange -0.115573 0.500150 -0.222036
THouseIndex 0.544215 0.367768 0.644745
THouseBenchmark 1.000000 0.169816 0.277978
THouseYoYChange 0.169816 1.000000 -0.090664
ApartIndex 0.277978 -0.090664 1.000000
ApartBenchmark 0.680096 -0.171528 0.595459
ApartYoYChange -0.054569 0.406512 0.008231
ApartBenchmark ApartYoYChange
CompIndex 0.550833 0.051194
CompBenchmark 0.798945 -0.084285
CompYoYChange -0.234686 0.602743
SFDetachIndex 0.365542 0.116215
SFDetachBenchmark 0.724441 -0.041487
SFDetachYoYChange -0.251923 0.327000
SFAttachIndex 0.469663 0.063699
SFAttachBenchmark 0.746322 -0.044532
SFAttachYoYChange -0.238097 0.318344
THouseIndex 0.457065 0.028606
THouseBenchmark 0.680096 -0.054569
THouseYoYChange -0.171528 0.406512
ApartIndex 0.595459 0.008231
ApartBenchmark 1.000000 -0.132643
ApartYoYChange -0.132643 1.000000
# Draw the correlation matrix as an annotated heatmap on its own figure.
fig_corr, ax_corr = plt.subplots(figsize=(12, 10))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', ax=ax_corr)
ax_corr.set_title('Correlation Heatmap')
plt.show()
#How has the average price of a home in Toronto changed over the past decade?
#Visualization: Time series plot
#Variables: Date (x-axis) and CompBenchmark or a specific property type benchmark (y-axis)
import pandas as pd
import matplotlib.pyplot as plt
# Read the CSV file
data = pd.read_csv('mls.csv')
# Convert the 'Date' column to a datetime object
data['Date'] = pd.to_datetime(data['Date'])
# Sort chronologically
data = data.sort_values('Date')
# The file holds rows for many Locations at each date, so plotting the raw
# column draws one line that zig-zags through every location. Average the
# composite benchmark across locations to get a single value per date.
# NOTE(review): Location mixes sub-areas with aggregates such as
# 'TREB Total' and 'City of Toronto'; filtering to one of those may be the
# better series for this question - confirm.
avg_comp_benchmark = data.groupby('Date')['CompBenchmark'].mean()
# Create the time series plot
plt.figure(figsize=(10, 5))
plt.plot(avg_comp_benchmark.index, avg_comp_benchmark.values)
# Set plot title and labels
plt.title('Average Home Price in Toronto (2011-2021)')
plt.xlabel('Year')
plt.ylabel('Composite Benchmark Price')
# Show the plot
plt.show()
import plotly.graph_objects as go
import pandas as pd
# Load data
df = pd.read_csv('mls.csv')

# One date-sorted sub-frame per neighborhood, in order of first appearance.
# The same sub-frames feed both the static traces and the animation frames,
# so frame data lines up with the traces by position.
neighborhood_frames = {
    neighborhood: group.sort_values('Date')
    for neighborhood, group in df.groupby('Location', sort=False)
}

# Create figure
fig = go.Figure()

# Add one line trace per neighborhood
for neighborhood, neighborhood_df in neighborhood_frames.items():
    fig.add_trace(
        go.Scatter(
            x=neighborhood_df['Date'],
            y=neighborhood_df['CompBenchmark'],
            name=neighborhood,
            mode='lines',
        )
    )

# Add layout options
fig.update_layout(
    title='Average Housing Prices in Toronto by Neighborhood over Time',
    xaxis_title='Year',
    yaxis_title='Price',
    hovermode='x',
    updatemenus=[dict(
        type='buttons',
        showactive=False,
        buttons=[
            dict(
                label='Play',
                method='animate',
                args=[None, dict(frame=dict(duration=500), fromcurrent=True)],
            ),
            dict(
                label='Pause',
                method='animate',
                args=[[None], dict(frame=dict(duration=0), mode='immediate',
                                   transition=dict(duration=0))],
            ),
        ],
    )],
)

# Define frames for animation: frame i shows the first i+1 observations of
# every neighborhood. Rows are sliced by position rather than de-duplicated
# with unique(), so each price stays paired with its date even when a price
# repeats. (The previous version referenced the loop variable left over
# from the trace loop, so it only animated the last neighborhood.)
step_count = len(df['Date'].unique())
frames = [
    go.Frame(
        data=[
            go.Scatter(
                x=neighborhood_df['Date'].iloc[:step + 1],
                y=neighborhood_df['CompBenchmark'].iloc[:step + 1],
                name=neighborhood,
                mode='lines',
            )
            for neighborhood, neighborhood_df in neighborhood_frames.items()
        ],
        name=str(step),
    )
    for step in range(step_count)
]

# Add frames to the figure
fig.frames = frames

# Show the figure
fig.show()
import pandas as pd
import plotly.figure_factory as ff
# Load the data and keep only the rows without any missing value.
data = pd.read_csv('mls.csv').dropna()

# Benchmark column of each home type, paired with its legend label and colour.
benchmark_columns = ['SFDetachBenchmark', 'SFAttachBenchmark',
                     'THouseBenchmark', 'ApartBenchmark']
group_labels = ['Detached', 'Attached', 'Townhouse', 'Apartment']
colors = ['#0000FF', '#FFA500', '#00FF00', '#FF0000']

# One price series per home type, in the same order as the labels.
hist_data = [data[column] for column in benchmark_columns]
detach_data, attach_data, th_data, apart_data = hist_data

# Histogram plus density curve per home type, 50,000-wide bins, no rug plot.
# No curve_type is passed, so plotly's default curve is used.
fig = ff.create_distplot(
    hist_data,
    group_labels,
    colors=colors,
    bin_size=50000,
    show_rug=False,
)

# Add title
fig.update_layout(title_text='Distribution of Average Home Prices by Type in Toronto (2011-2021)')
fig.show()
#Variables: Date (x-axis) and CompBenchmark or a specific property type benchmark (y-axis)
#CompBenchmark stands for Composite Benchmark Price. It is a weighted average of the benchmark prices
#for all property types in the dataset (Single-Family Detached, Single-Family Attached, Townhouse, and Apartment).
#The composite benchmark price provides a comprehensive view of the overall housing market by taking into account
#the price trends across different property types.
#By analyzing the Composite Benchmark Price over time, you can get a sense of how the average price
#of a home in a specific area has changed.
#2 Are there any patterns or trends in the housing market that can be observed over time?
#Visualization: Multiple time series plots on the same graph for different property types
#Variables: Date (x-axis), SFDetachBenchmark, SFAttachBenchmark, THouseBenchmark, and ApartBenchmark (all on the y-axis)
#Some potential factors that can cause patterns or trends in the housing market include:
#Economic factors: The performance of the overall economy can greatly impact the housing market.
#Economic growth, employment rates, and income levels can affect the demand for housing and lead to price fluctuations.
#Population growth: An increase in population can create higher demand for housing,
#pushing up prices. Conversely, a decline in population or slow population growth may lead to a decrease in demand and lower prices.
#Interest rates: Changes in interest rates can affect housing affordability and demand.
#Lower interest rates can make mortgages more affordable, leading to an increase in demand
#and higher prices. Higher interest rates can have the opposite effect.
#Government policies: Government policies, such as tax incentives or subsidies for homebuyers,
#can influence the housing market by affecting demand and affordability. Changes in zoning regulations,
#rent controls, and affordable housing policies can also impact housing prices.
#Supply of housing: The availability of housing, including new constructions and existing homes,
#can influence housing prices. If the supply is limited or does not keep up with demand, prices may rise.
#An oversupply of housing can lead to lower prices.
#Location and neighborhood factors: The desirability of a particular location, such as proximity to
#employment centers, schools, public transportation, and amenities, can affect housing prices. Additionally,
#neighborhood characteristics, such as safety and quality of life, can influence demand and prices.
#External events: Events such as natural disasters, pandemics, or political instability can have an
#impact on the housing market by affecting demand, supply, or both.
#To determine the specific causes of observed patterns or trends in your data, it's important
#to perform a more detailed analysis, which may include investigating correlations between housing
#prices and various factors, as well as researching historical events and policies that may have
#influenced the housing market in Toronto.
import pandas as pd
import matplotlib.pyplot as plt
# Read the CSV file
data = pd.read_csv('mls.csv')
# Convert the 'Date' column to a datetime object
data['Date'] = pd.to_datetime(data['Date'])
# Sort the data by date
data = data.sort_values('Date')

# (benchmark column, panel title) for each property type, in grid order
panels = [
    ('SFDetachBenchmark', 'Single-Family Detached'),
    ('SFAttachBenchmark', 'Single-Family Attached'),
    ('THouseBenchmark', 'Townhouse'),
    ('ApartBenchmark', 'Apartment'),
]

# The file holds rows for many Locations at each date; plotting the raw
# columns would draw one zig-zag line through all of them. Average across
# locations so that every panel shows a single value per date.
mean_by_date = data.groupby('Date')[[column for column, _ in panels]].mean()

# Create a 2x2 grid of subplots
fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(12, 8))
fig.tight_layout(pad=5)

# axes.flat walks the grid row by row: [0, 0], [0, 1], [1, 0], [1, 1]
for ax, (column, panel_title) in zip(axes.flat, panels):
    ax.plot(mean_by_date.index, mean_by_date[column])
    ax.set_title(panel_title)
    ax.set_xlabel('Year')
    ax.set_ylabel('Benchmark Price')

# Show the plot
plt.show()
import pandas as pd
import matplotlib.pyplot as plt
# Read the CSV file
data = pd.read_csv('mls.csv')
# Convert the 'Date' column to a datetime object
data['Date'] = pd.to_datetime(data['Date'])
# Sort the data by date
data = data.sort_values('Date')

# (benchmark column, legend label) for each property type
series = [
    ('SFDetachBenchmark', 'Single-Family Detached'),
    ('SFAttachBenchmark', 'Single-Family Attached'),
    ('THouseBenchmark', 'Townhouse'),
    ('ApartBenchmark', 'Apartment'),
]

# The file holds rows for many Locations at each date; average across
# locations so each property type is drawn as one line with a single value
# per date instead of a zig-zag through every location.
mean_by_date = data.groupby('Date')[[column for column, _ in series]].mean()

# Create the time series plot with multiple lines
plt.figure(figsize=(12, 6))
for column, legend_label in series:
    plt.plot(mean_by_date.index, mean_by_date[column], label=legend_label)

# Set plot title and labels
plt.title('Housing Market Trends in Toronto (2011-2021)')
plt.xlabel('Year')
plt.ylabel('Benchmark Price')
# Add a legend to the plot
plt.legend()
# Show the plot
plt.show()
import pandas as pd
# Load the data and parse the 'Date' column as datetimes.
data = pd.read_csv('mls.csv')
data['Date'] = pd.to_datetime(data['Date'])

# For each property type, take the full row at which its benchmark peaks.
max_sf_detach, max_sf_attach, max_t_house, max_apart = (
    data.loc[data[column].idxmax()]
    for column in ('SFDetachBenchmark', 'SFAttachBenchmark',
                   'THouseBenchmark', 'ApartBenchmark')
)

# (heading, peak row, benchmark column); headings after the first start
# with a blank line.
peak_report = (
    ("Single-Family Detached:", max_sf_detach, 'SFDetachBenchmark'),
    ("\nSingle-Family Attached:", max_sf_attach, 'SFAttachBenchmark'),
    ("\nTownhouse:", max_t_house, 'THouseBenchmark'),
    ("\nApartment:", max_apart, 'ApartBenchmark'),
)

# Print the peak price and its date for every property type.
for heading, peak_row, column in peak_report:
    print(heading)
    print("Highest Price: ", peak_row[column])
    print("Date: ", peak_row['Date'])
Single-Family Detached: Highest Price: 2536900.0 Date: 2017-04-01 00:00:00 Single-Family Attached: Highest Price: 1677200.0 Date: 2021-03-01 00:00:00 Townhouse: Highest Price: 1750500.0 Date: 2020-10-01 00:00:00 Apartment: Highest Price: 1005500.0 Date: 2021-02-01 00:00:00
import pandas as pd
# Read the CSV file
data = pd.read_csv('mls.csv')
# Convert the 'Date' column to a datetime object
data['Date'] = pd.to_datetime(data['Date'])

benchmark_columns = ['SFDetachBenchmark', 'SFAttachBenchmark',
                     'THouseBenchmark', 'ApartBenchmark']
# A benchmark of 0 cannot be a real price (the recorded run reported a
# "lowest" townhouse price of 0.0); presumably it is a placeholder for
# missing data. Mask non-positive values so they are ignored by both the
# minimum search and the standard deviation.
prices = data[benchmark_columns].where(data[benchmark_columns] > 0)

# Find the lowest benchmark prices and corresponding dates for each property type
min_sf_detach = data.loc[prices['SFDetachBenchmark'].idxmin()]
min_sf_attach = data.loc[prices['SFAttachBenchmark'].idxmin()]
min_t_house = data.loc[prices['THouseBenchmark'].idxmin()]
min_apart = data.loc[prices['ApartBenchmark'].idxmin()]

# Find the most steady prices by calculating the standard deviation for each property type
std_sf_detach = prices['SFDetachBenchmark'].std()
std_sf_attach = prices['SFAttachBenchmark'].std()
std_t_house = prices['THouseBenchmark'].std()
std_apart = prices['ApartBenchmark'].std()

# Print the lowest prices
print("Lowest Prices:")
lowest_report = (
    ("\nSingle-Family Detached:", min_sf_detach, 'SFDetachBenchmark'),
    ("\nSingle-Family Attached:", min_sf_attach, 'SFAttachBenchmark'),
    ("\nTownhouse:", min_t_house, 'THouseBenchmark'),
    ("\nApartment:", min_apart, 'ApartBenchmark'),
)
for heading, lowest_row, column in lowest_report:
    print(heading)
    print("Lowest Price: ", lowest_row[column])
    print("Date: ", lowest_row['Date'])

# Print the most steady prices
print("\nMost Steady Prices (Standard Deviation):")
print("\nSingle-Family Detached: ", std_sf_detach)
print("Single-Family Attached: ", std_sf_attach)
print("Townhouse: ", std_t_house)
print("Apartment: ", std_apart)
Lowest Prices: Single-Family Detached: Lowest Price: 259100.0 Date: 2015-07-01 00:00:00 Single-Family Attached: Lowest Price: 273300.0 Date: 2015-07-01 00:00:00 Townhouse: Lowest Price: 0.0 Date: 2018-08-01 00:00:00 Apartment: Lowest Price: 171400.0 Date: 2015-10-01 00:00:00 Most Steady Prices (Standard Deviation): Single-Family Detached: 376586.54756233987 Single-Family Attached: 243377.69092803219 Townhouse: 221250.43880756843 Apartment: 145759.334129126
#3 How do different neighbourhoods in Toronto compare in terms of housing prices and trends?
#First, I created a word cloud of the neighbourhood names in and around the Toronto area.
#By looking at the box plot, we can gain several insights about the Toronto housing market:
#There is a wide range of housing prices across different neighborhoods in Toronto. Some neighborhoods have higher
#housing prices than others, which suggests that the demand for housing varies depending on the location.
##There are some neighborhoods with a high degree of variability in housing prices, as indicated by the length
#of the box plot whiskers. This could suggest that the housing market in these neighborhoods is more volatile or
#subject to greater fluctuations.
#There are several outliers in the box plot, which represent individual data points that fall outside of the
#range of typical values for that neighborhood. These outliers could be due to a variety of factors, such as
#unique features or amenities of a property, or other market conditions that are not representative of
#the overall trend in that neighborhood.
#Overall, this box plot provides a useful visualization of the distribution of housing prices
#across different neighborhoods in Toronto, and can help us to better understand the factors that
#drive variation in the Toronto housing market.
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from wordcloud import WordCloud, STOPWORDS
# Load the data and parse the 'Date' column as datetimes.
data = pd.read_csv('mls.csv')
data['Date'] = pd.to_datetime(data['Date'])

# Join every distinct location name into one space-separated text.
neighborhoods = ' '.join(data['Location'].unique())

# Build the word cloud from that text (white background, default stop words).
cloud_builder = WordCloud(
    width=800,
    height=400,
    background_color='white',
    stopwords=STOPWORDS,
)
wordcloud = cloud_builder.generate(neighborhoods)

# Render the cloud as an image without axes.
plt.figure(figsize=(12, 8))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.title('Neighborhoods in Toronto')
plt.show()
import pandas as pd
import folium
import geocoder
# Function to get latitude and longitude for a given neighborhood
def get_lat_lng(neighborhood):
    """Geocode a neighborhood name with the ArcGIS provider.

    Args:
        neighborhood: Location name; ', Toronto, Ontario' is appended
            before the lookup.

    Returns:
        The provider's ``latlng`` value (latitude, longitude) when the
        lookup yields one, otherwise ``[None, None]``.
    """
    # NOTE(review): the fixed ', Toronto, Ontario' suffix is also applied to
    # places outside the city (e.g. 'Barrie', 'York Region') - confirm that
    # those still resolve to the intended place.
    location = geocoder.arcgis(f'{neighborhood}, Toronto, Ontario')
    latlng = location.latlng
    # An unsuccessful lookup gives an empty/None value; return a
    # two-element placeholder so callers can always build two columns.
    return latlng if latlng else [None, None]
# Read the CSV file and drop missing values
data = pd.read_csv("mls.csv")
clean_data = data.dropna()

# Calculate the average benchmark values for each neighborhood.
# numeric_only=True keeps the string 'Date' column out of the mean; without
# it pandas >= 2.0 raises instead of silently dropping the column.
neighborhood_data = clean_data.groupby("Location").mean(numeric_only=True).reset_index()

# Add latitude and longitude to the neighborhood_data DataFrame
neighborhood_data[["Latitude", "Longitude"]] = pd.DataFrame(
    neighborhood_data["Location"].apply(get_lat_lng).tolist(),
    columns=["Latitude", "Longitude"]
)

# Create the map centered on Toronto
map_toronto = folium.Map(location=[43.6532, -79.3832], zoom_start=11)

# Define the maximum and minimum benchmark values for scaling the marker sizes and colors
max_value = neighborhood_data['SFDetachBenchmark'].max()
min_value = neighborhood_data['SFDetachBenchmark'].min()

# Add circular markers to the map based on the benchmark values
for index, row in neighborhood_data.iterrows():
    # A neighborhood that could not be geocoded has no position to draw;
    # folium rejects missing coordinates, so skip it.
    if pd.isna(row["Latitude"]) or pd.isna(row["Longitude"]):
        continue

    # Marker radius scales linearly from 0 (cheapest) to 100 (most
    # expensive); only the most expensive neighborhood is drawn in red.
    marker_size = (row['SFDetachBenchmark'] - min_value) / (max_value - min_value) * 100
    marker_color = 'red' if row['SFDetachBenchmark'] == max_value else 'green'

    # Create the marker and add it to the map
    folium.CircleMarker(
        location=[row["Latitude"], row["Longitude"]],
        radius=marker_size,
        color=marker_color,
        fill=True,
        fill_color=marker_color,
        fill_opacity=0.7,
        popup=f"{row['Location']}: ${row['SFDetachBenchmark']:.2f}"
    ).add_to(map_toronto)

# Display the map
map_toronto
#NOTE(review): the letter in district names such as "Toronto C12" or "Toronto C10" most likely stands
#for "Central" (with "E" for East and "W" for West) rather than "Community" - confirm against the
#Toronto Regional Real Estate Board (TRREB) district map.
#These alphanumeric codes identify the MLS districts used in the data: the first letter indicates the
#part of the city, and the following two digits identify the specific district within it.
#In this dataset the "C" districts run from C01 to C15 (there is no C05), the "E" districts
#from E01 to E11, and the "W" districts from W01 to W10.
#The current City of Toronto was formed in 1998, when the former city of Toronto was amalgamated with
#five other municipalities.
#The "C" districts are diverse in terms of demographics, housing types, and amenities, and include
#some of the most desirable and affluent areas in Toronto.
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
# Read the housing data CSV file
data = pd.read_csv('mls.csv')
# Create a box plot for each neighborhood
plt.figure(figsize=(12, 8))
sns.boxplot(data=data, x='Location', y='CompBenchmark')
# Set plot title and axis labels
plt.title('Housing Prices by Neighborhood in Toronto')
plt.xlabel('Neighborhood')
plt.ylabel('CompBenchmark')
# Rotate x-axis labels for better readability
plt.xticks(rotation=90)
# Show the plot
plt.show()
import matplotlib.pyplot as plt
import seaborn as sns
# Compare the detached-home price distributions of the N most and N least
# expensive neighbourhoods (ranked by mean SFDetachBenchmark).
# Read in the housing data CSV file
housing_data = pd.read_csv('mls.csv')
# Drop only rows missing the fields used here. A blanket dropna() would also
# discard rows whose only gaps are in unrelated columns (e.g. the townhouse or
# apartment figures), shrinking the sample for no reason.
clean_data = housing_data.dropna(subset=['Location', 'SFDetachBenchmark'])
# Calculate the average benchmark value for single-family detached homes in
# each neighborhood. This has to exist before it can be sorted: the previous
# statement order sorted it first and raised NameError on a fresh run.
neighborhood_data = clean_data.groupby('Location')['SFDetachBenchmark'].mean().reset_index()
# Rank neighbourhoods from most to least expensive
sorted_neighborhoods = neighborhood_data.sort_values("SFDetachBenchmark", ascending=False)
# Choose the top N and bottom N neighborhoods
N = 5
top_neighborhoods = sorted_neighborhoods.head(N)["Location"].tolist()
bottom_neighborhoods = sorted_neighborhoods.tail(N)["Location"].tolist()
# Combine the top and bottom neighborhoods
selected_neighborhoods = top_neighborhoods + bottom_neighborhoods
# Filter the clean_data DataFrame to include only the selected neighborhoods
selected_data = clean_data[clean_data["Location"].isin(selected_neighborhoods)]
# Create a box plot comparing the price distributions across the top and bottom neighborhoods
plt.figure(figsize=(12, 6))
sns.boxplot(x="Location", y="SFDetachBenchmark", data=selected_data)
plt.xticks(rotation=45)
plt.xlabel("Location")
plt.ylabel("Home Prices")
plt.title(f"Price Distribution Comparison in Top {N} and Bottom {N} Neighbourhoods")
plt.show()
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Show how benchmark prices are distributed in each calendar month, split by
# housing type.
# Read the CSV file
data = pd.read_csv('mls.csv')
# Convert the 'Date' column to a datetime object
data['Date'] = pd.to_datetime(data['Date'])
# Add columns for year and month
data['Year'] = data['Date'].dt.year
data['Month'] = data['Date'].dt.month_name()
# Benchmark column -> label shown in the legend
housing_types = {
    'SFDetachBenchmark': 'Single-Family Detached',
    'SFAttachBenchmark': 'Single-Family Attached',
    'THouseBenchmark': 'Townhouse',
    'ApartBenchmark': 'Apartment',
}
# Calendar order for the x-axis; without it the months appear in whatever
# order they first occur in the file.
month_order = [
    'January', 'February', 'March', 'April', 'May', 'June',
    'July', 'August', 'September', 'October', 'November', 'December',
]
# Reshape to long format so one violinplot call can place the housing types
# side by side via `hue`. Four separate calls on the same axes paint the
# violins on top of each other and leave the legend with nothing to label.
long_data = data.melt(
    id_vars='Month',
    value_vars=list(housing_types),
    var_name='Housing Type',
    value_name='Benchmark',
)
long_data['Housing Type'] = long_data['Housing Type'].map(housing_types)
# Create a grouped violin plot: one violin per (month, housing type)
sns.set_style('darkgrid')
plt.figure(figsize=(12, 6))
ax = sns.violinplot(
    data=long_data,
    x='Month',
    y='Benchmark',
    hue='Housing Type',
    order=month_order,
    palette=['lightblue', 'lightgreen', 'orange', 'pink'],
)
# Set plot title and labels
ax.set_title('Monthly Variation in Toronto Housing Prices (2011-2021)', fontsize=14)
ax.set_xlabel('Month', fontsize=12)
ax.set_ylabel('Benchmark Price', fontsize=12)
ax.tick_params(axis='x', labelsize=12)
# seaborn builds the legend from `hue`; only reposition and resize it
sns.move_legend(ax, 'upper left', fontsize=12)
# Show the plot
plt.show()
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
# Correlation heatmap between the composite benchmark price and three
# candidate explanatory factors.
# Read the housing data CSV file
data = pd.read_csv('mls.csv')
# NOTE: the three "factor" columns below are synthetic placeholders drawn from
# a standard normal distribution. They are NOT real economic data, so any
# correlation shown is noise. Replace them with real series before drawing
# conclusions from this figure.
# A seeded generator keeps the figure identical from run to run.
rng = np.random.default_rng(42)
for factor in ('EconomicIndicator', 'PopulationGrowth', 'InterestRate'):
    data[factor] = rng.standard_normal(len(data))
# Calculate the correlation matrix between housing prices and the factors
corr_matrix = data[['CompBenchmark', 'EconomicIndicator', 'PopulationGrowth', 'InterestRate']].corr()
# Create a heatmap to visualize the correlations
plt.figure(figsize=(10, 8))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm')
# Set plot title
plt.title('Correlation Heatmap between Housing Prices and Factors')
# Show the plot
plt.show()
#4 Are there any factors that are driving changes in the Toronto housing market, such as changes in the economy,
#population growth, or policy changes?
#The cell below uses Pandas and Seaborn to create a heatmap that visualizes how different
#industries, as classified by the North American Industry Classification System (NAICS), contribute to
#the housing market in each year.
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
# Heatmap of summed VALUE per reference date (rows) and NAICS industry (columns)
file_path = "36100434.csv"
naics_col = 'North American Industry Classification System (NAICS)'
# Load the table, keep the three columns of interest, and total VALUE for
# every (date, industry) pair
df = (
    pd.read_csv(file_path, low_memory=False)[['REF_DATE', naics_col, 'VALUE']]
    .groupby(['REF_DATE', naics_col])
    .sum()
    .reset_index()
)
# Wide layout: one row per date, one column per industry
df_pivot = df.pivot(index='REF_DATE', columns=naics_col, values='VALUE')
# Draw the heatmap and label it through the returned axes
ax = sns.heatmap(df_pivot, cmap='YlGnBu')
ax.set(
    xlabel='NAICS Code',
    ylabel='Year',
    title='Heatmap of Toronto Housing Market by NAICS Code',
)
# Render the figure
plt.show()
#5. How has the pandemic affected the Toronto housing market, and what
#are the long-term implications?
#One of the main effects of the pandemic on the Toronto housing market has been an increase in demand
#for larger homes with outdoor space, as people looked to accommodate work-from-home arrangements and
#spend more time at home. This has driven up prices for single-family homes and larger condominium units.
import pandas as pd
import matplotlib.pyplot as plt
# Bar chart of the mean composite benchmark price in 2019, 2020 and 2021,
# with the standard deviation of each year as the error bar
data = pd.read_csv('mls.csv')
# Parse the dates so rows can be selected by calendar year
data['Date'] = pd.to_datetime(data['Date'])
row_year = data['Date'].dt.year
# CompBenchmark values for each year, selected once and reused below
prices_by_year = {
    year: data.loc[row_year == year, 'CompBenchmark']
    for year in (2019, 2020, 2021)
}
# Mean price before (2019), during (2020) and after (2021) the pandemic
before_pandemic = prices_by_year[2019].mean()
during_pandemic = prices_by_year[2020].mean()
after_pandemic = prices_by_year[2021].mean()
# Bars show the yearly means; error bars show the yearly standard deviations
period_labels = ['Before pandemic', 'During pandemic', 'After pandemic']
period_means = [before_pandemic, during_pandemic, after_pandemic]
period_spread = [prices.std() for prices in prices_by_year.values()]
plt.bar(period_labels, period_means, yerr=period_spread)
# Title and axis labels
plt.title('Average Home Prices in Toronto Before, During, and After the Pandemic')
plt.xlabel('Time period')
plt.ylabel('Average home price')
# Render the figure
plt.show()
import pandas as pd
import plotly.graph_objects as go
# Stacked area chart of the four housing-type benchmarks over time, with the
# pandemic period shaded and labelled.
# Load housing data
df = pd.read_csv('mls.csv')
# Parse the dates so comparisons and the x-axis use real timestamps instead of
# raw strings (string comparison is only correct for ISO-formatted text).
df['Date'] = pd.to_datetime(df['Date'])
# Create a new column to indicate whether the date is within the pandemic period
df['Pandemic'] = (df['Date'] >= pd.Timestamp('2020-03-01')).map({True: 'Yes', False: 'No'})
# NOTE(review): the file has a Location column, so a date presumably appears
# once per neighbourhood and the rows are plotted as-is. Consider aggregating
# per date first; confirm the intended granularity.
# Create a stacked area chart: (column, legend label, line colour) per trace
housing_series = [
    ('SFDetachBenchmark', 'Single Family Detached', 'blue'),
    ('SFAttachBenchmark', 'Single Family Attached', 'orange'),
    ('THouseBenchmark', 'Townhouse', 'green'),
    ('ApartBenchmark', 'Apartment', 'red'),
]
fig = go.Figure()
for column, label, colour in housing_series:
    fig.add_trace(go.Scatter(x=df['Date'], y=df[column], name=label,
                             stackgroup='one', line=dict(color=colour)))
fig.update_layout(title='Housing Prices by Type Over Time',
                  xaxis_title='Date', yaxis_title='Price',
                  hovermode='x unified',
                  hoverlabel=dict(bgcolor='white', font_size=14))
# Shade the area of the chart during the pandemic period
fig.update_layout(shapes=[dict(type='rect', xref='x', yref='paper', x0='2020-03-01', y0=0, x1='2023-04-02', y1=1,
                               fillcolor='gray', opacity=0.2, layer='below', line_width=0)])
# Add text to indicate the pandemic period. yref='paper' makes y=0.95 a
# fraction of the plot height; without it the label is placed at price 0.95
# in data coordinates, i.e. on the x-axis.
fig.add_annotation(x='2021-01-01', y=0.95, yref='paper', text='Pandemic Period', showarrow=False,
                   font=dict(color='black', size=16))
fig.show()
'''This chart shows the trend in housing prices for each type over time,
with the shaded area indicating the period of the pandemic. You can see that
housing prices for all types were on an upward trend prior to the pandemic, but
experienced a temporary dip during the early stages of the pandemic before rebounding
and continuing to rise at an accelerated pace. The chart also shows how the different
types of housing have been affected differently by the pandemic, with single-family
detached homes experiencing the largest price increases.'''
"'this chart shows the trend in housing prices for each type over time, \nwith the shaded area indicating the period of the pandemic. You can see that \nhousing prices for all types were on an upward trend prior to the pandemic, but\nexperienced a temporary dip during the early stages of the pandemic before rebounding\nand continuing to rise at an accelerated pace. The chart also shows how the different\ntypes of housing have been affected differently by the pandemic, with single-family \ndetached homes experiencing the largest price increases."
import pandas as pd
import plotly.graph_objects as go
# Time series of the composite benchmark price with dashed vertical markers
# at three pandemic-related dates
data = pd.read_csv('mls.csv')
# Parse the dates for the time axis
data['Date'] = pd.to_datetime(data['Date'])
# Base line chart
fig = go.Figure()
fig.add_trace(
    go.Scatter(
        x=data['Date'],
        y=data['CompBenchmark'],
        mode='lines',
        name='CompBenchmark',
    )
)
# (date, line colour, shape name) for each marked event
pandemic_events = [
    ('2020-03-11', 'red', 'WHO declares pandemic'),
    ('2020-03-17', 'green', 'Ontario declares emergency'),
    ('2020-06-17', 'purple', 'Toronto enters Phase 2 reopening'),
]
# One full-height dashed line per event; yref='paper' makes y0/y1 span the
# whole plot area regardless of the price range
for event_date, event_colour, event_name in pandemic_events:
    fig.add_shape(
        type='line',
        x0=event_date, x1=event_date,
        y0=0, y1=1,
        yref='paper',
        line=dict(color=event_colour, dash='dash'),
        name=event_name,
    )
fig.update_layout(
    title='Impact of the Pandemic on the Toronto Housing Market',
    xaxis_title='Date',
    yaxis_title='CompBenchmark',
)
fig.show()